Modeling the Joint Distribution of Wind Speed and Direction using Gaussian Mixture Models

OEN Method: Harris & Cook, "The parent wind speed distribution: Why Weibull?" http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up

1.1 Environment

In [1]:
# Notebook environment setup: inline figures plus autoreload so edits to the
# local helper modules are picked up without restarting the kernel.
%matplotlib inline
%load_ext autoreload
%autoreload 2

# NOTE(review): wildcard imports supply plt/mpl/np/pd and the plotting helpers
# used throughout the notebook; kept as-is since everything below relies on them.
from import_file import *
from helpers.parallel_helper import *
load_libs()

# Round axis limits and remove margins so histograms start exactly at the axes.
plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# Station selection: exactly one file_path is active; the commented lines are
# alternative stations with per-station observations noted after the path.
file_path= './data/NCDC/uk/marham/dat.txt' 
# file_path= './data/NCDC/uk/tiree/dat.txt'  # try 4
# file_path= './data/NCDC/uk/boscombe_down/dat.txt' # 4?, numpy bug
# file_path= './data/NCDC/uk/middle_wallop/dat.txt' 
# file_path= './data/NCDC/uk/southhamption/dat.txt' # high 0, trend
# file_path= './data/NCDC/uk/bournemouth/dat.txt' # 4?
# file_path= "./data/NCDC/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/uk/south_uist_range/dat.txt" # improper direction R square measure
# file_path= "./data/NCDC/uk/holbeach/dat.txt" # improper direction R square measure
# file_path= "./data/NCDC/uk/cambridge/dat.txt" # improper direction R square measure
# file_path= "./data/NCDC/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?

# Iran
# file_path= './data/NCDC/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/chahbahar/dat.txt'

# Malaysia
# file_path= './data/NCDC/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/penang/dat.txt'
# file_path= './data/NCDC/butterworth/dat.txt' # 2 mode 

# France
# file_path = './data/NCDC/pau_pyrenees/dat.txt' # unit shift, 2; force using knot 
# file_path= './data/NCDC/nantes/dat.txt' # some dir R square / K-S differs big, unit detect fails

# file_path= "./data/NCDC/southeast_asia/singapore_changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/singapore_seletar/dat.txt"
# file_path= "./data/NCDC/sultan_mahmud/dat.txt" # Stable
# file_path= "./data/NCDC/southeast_asia/sultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path= "./data/NCDC/europe/landsberg_lech/dat.txt" # very good, can try 4
# file_path= "./data/NCDC/europe/neuburg/dat.txt"
# file_path= "./data/NCDC/europe/valladolid/dat.txt"
# file_path= "./data/NCDC/europe/laupheim/dat.txt" # double peak, 4; very good, trend
# file_path= "./data/NCDC/europe/avord/dat.txt" # try 4, initial speed (should be good with m/s)
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/holzdorf/dat.txt" # 2008 year
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= "./data/NCDC/europe/barayas/dat.txt" # numpy problem
# file_path= "./data/NCDC/europe/vatry/dat.txt"  # double peak, initial speed (should be good with m/s), mixed report type
# file_path= './data/NCDC/europe/tenerife_sur/dat.txt'  # some directions are blocked

# file_path= "./data/NCDC/oceania/auckland_intl/dat.txt"  # Good data, Weird KDE shape, might be blocked?
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path= "./data/NCDC/oceania/canberra/dat.txt" # high 0, numpy problem

# file_path= "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt" 
# file_path= "./data/NCDC/cn/shanghai/pudong/dat.txt"
# file_path= "./data/NCDC/cn/nanjing_lukou/dat.txt" 
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/hefei_luogang/dat.txt" # few 0, trend
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy problem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0

# file_path = 'TOP/hr_avg.csv' # High 0
# file_path = './data/asos/denver/hr_avg.csv'
# file_path = './data/asos/bismarck_ND/hr_avg.csv' # try 4?
# file_path = './data/asos/aberdeen_SD/hr_avg.csv' # only to 2012, good fit, try 2
# file_path = './data/asos/minneapolis/hr_avg.csv'
# file_path = './data/asos/lincoln_NE/hr_avg.csv' 
# file_path = './data/asos/des_moines_IA/hr_avg.csv'
# file_path = './data/asos/springfield_IL/hr_avg.csv' # good fit

# file_path = './data/NDAWN/baker/hr_avg.csv' # 4 might be better
# file_path = './data/NDAWN/dickinson/hr_avg.csv'
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'
In [3]:
# Load the raw wind record into `df` and set the per-source flags used later:
#   integer_data - True when dir/speed come as coarse integer-style records (NCDC)
#   knot_unit    - True when speed is recorded in knots; deliberately left
#                  unset for NCDC sources so section 1.2.1 can auto-detect it.
# NOTE(review): the cn_database branch sets neither flag -- confirm those
# sources always run the auto-detection path.
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    # NCDC export: normalize column names and keep only the fields we use.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    integer_data = True
else:
    # NDAWN and ASOS hourly-average exports share the same layout; the two
    # original branches were duplicates differing only in the recorded unit
    # (NDAWN is m/s, ASOS is knots), so they are merged here.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type'] = 'default'
    df['wind_type'] = 'default'
    df = df.dropna()
    integer_data = False
    knot_unit = 'NDAWN' not in file_path
In [4]:
df
Out[4]:
date HrMn type dir speed wind_type
0 19730101 0000 FM-12 190 4.1 N
1 19730101 0100 FM-12 180 3.6 N
2 19730101 0300 FM-12 200 3.6 N
3 19730101 0400 FM-12 190 4.1 N
4 19730101 0500 FM-12 180 4.6 N
5 19730101 0600 FM-12 200 5.6 N
6 19730101 0700 FM-12 200 4.6 N
7 19730101 0800 FM-12 200 4.1 N
8 19730101 0900 FM-12 190 3.6 N
9 19730101 1000 FM-12 190 4.6 N
10 19730101 1100 FM-12 200 3.6 N
11 19730101 1200 FM-12 190 3.6 N
12 19730101 1300 FM-12 200 3.6 N
13 19730101 1400 FM-12 180 3.6 N
14 19730101 1500 FM-12 190 3.6 N
15 19730101 1600 FM-12 210 5.6 N
16 19730101 1700 FM-12 210 5.6 N
17 19730101 1800 FM-12 210 4.6 N
18 19730101 1900 FM-12 200 3.6 N
19 19730101 2000 FM-12 210 2.0 N
20 19730101 2200 FM-12 200 3.0 N
21 19730101 2300 FM-12 190 3.0 N
22 19730102 0000 FM-12 180 3.6 N
23 19730102 0100 FM-12 190 3.6 N
24 19730102 0200 FM-12 200 2.5 N
25 19730102 0300 FM-12 190 3.6 N
26 19730102 0400 FM-12 170 2.5 N
27 19730102 0500 FM-12 180 2.0 N
28 19730102 0600 FM-12 190 3.0 N
29 19730102 0700 FM-12 180 3.6 N
... ... ... ... ... ... ...
530644 20160801 0850 FM-15 290 2.6 N
530645 20160801 0900 FM-12 290 2.6 N
530646 20160801 0950 FM-15 260 2.6 N
530647 20160801 1000 FM-12 260 2.6 N
530648 20160801 1050 FM-15 250 2.6 N
530649 20160801 1150 FM-15 230 3.6 N
530650 20160801 1200 FM-12 230 3.6 N
530651 20160801 1250 FM-15 240 3.6 N
530652 20160801 1300 FM-12 240 3.6 N
530653 20160801 1350 FM-15 230 4.6 N
530654 20160801 1400 FM-12 230 4.6 N
530655 20160801 1450 FM-15 250 5.1 N
530656 20160801 1500 FM-12 250 5.1 N
530657 20160801 1550 FM-15 220 5.1 N
530658 20160801 1600 FM-12 220 5.1 N
530659 20160801 1650 FM-15 230 7.7 N
530660 20160801 1700 FM-12 230 7.7 N
530661 20160801 1750 FM-15 210 3.6 N
530662 20160801 1800 FM-12 210 3.6 N
530663 20160801 1850 FM-15 210 4.6 N
530664 20160801 1900 FM-12 210 4.6 N
530665 20160801 1950 FM-15 200 3.1 N
530666 20160801 2000 FM-12 200 3.1 N
530667 20160801 2050 FM-15 180 3.6 N
530668 20160801 2100 FM-12 180 3.6 N
530669 20160801 2150 FM-15 160 3.6 N
530670 20160801 2200 FM-12 160 3.6 N
530671 20160801 2250 FM-15 160 4.1 N
530672 20160801 2300 FM-12 160 4.1 N
530673 20160801 2350 FM-15 140 3.6 N

530674 rows × 6 columns

In [5]:
# Build a DatetimeIndex from the integer date (YYYYMMDD) and the zero-padded
# HrMn string (read as 'object' on purpose so leading zeros survive), then keep
# only plausible records: dir <= 999 (999 marks variable/missing direction),
# speed < 100, and dates within 1970-2016.
# Fix: .astype(str) already yields strings; the original extra .map(str) was redundant.
df['time'] = pd.to_datetime(df["date"].astype(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn'] = df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) & \
              (date >= 19700000) & (date < 20170000) ")
In [6]:
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [7]:
# Direction preprocessing:
# 1. Fold [10, 360] onto [0, 350] (360 becomes 0); 999 stays as the missing marker.
# 2. Extract the month from the YYYYMMDD integer date.
# 3. Keep the windrose bearing (clockwise from north) in dir_windrose and
#    convert `dir` to polar convention (counter-clockwise from east) so the
#    later (x, y) decomposition can use standard trigonometry.
# Vectorized with Series.where instead of the original per-row apply/lambda;
# .copy() materializes the query() slice to silence SettingWithCopyWarning.
df = df.copy()
valid = df['dir'] < 999  # 999 rows are left untouched by both transforms
df['dir'] = df['dir'].where(~valid, df['dir'] % 360)
df['month'] = df['date'] % 10000 // 100
# Convert windrose coordinates to polar coordinates
df['dir_windrose'] = df['dir']
df['dir'] = df['dir'].where(~valid, (90 - df['dir']) % 360)
df.describe()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[7]:
date HrMn dir speed month dir_windrose
count 5.306410e+05 530641.000000 530641.000000 530641.000000 530641.000000 530641.000000
mean 1.997966e+07 1163.139273 204.537768 4.677674 6.474607 198.268426
std 1.231520e+05 689.317508 139.732192 2.748018 3.444669 140.666630
min 1.973010e+07 0.000000 0.000000 0.000000 1.000000 0.000000
25% 1.989041e+07 600.000000 120.000000 2.600000 3.000000 120.000000
50% 2.000091e+07 1150.000000 210.000000 4.100000 6.000000 200.000000
75% 2.009021e+07 1750.000000 260.000000 6.200000 9.000000 260.000000
max 2.016080e+07 2355.000000 999.000000 28.300000 12.000000 999.000000
In [8]:
df.plot(y='speed',legend=True,figsize=(20,5))
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0xe3999e8>

1.2.1 Unit Detection

In [9]:
# Unit auto-detection for sources without an explicit unit flag.
# Heuristic: if many fractional parts are >= 0.2, the source is genuine m/s
# with one decimal; otherwise the values are knots that NOAA converted to m/s,
# and scaling back by 1.943845 should land near integers.
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))
if 'knot_unit' not in globals():
    # More than 30% "large" decimals -> treat the source as m/s (not knots).
    knot_unit = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False

    if knot_unit:
        # Convert back to knots and snap to the nearest integer.
        df['speed'] = df['speed'] * 1.943845
        df['decimal'] = df.speed % 1
        df.decimal.hist(alpha=0.5, label='knot')
        # need more elaboration, some is not near an integer
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')

# Fix: drop(labels, 1) relied on the positional axis argument, which was
# removed in pandas 2.0; use the explicit columns= keyword instead.
df.drop(columns=['decimal'], inplace=True)
print(knot_unit)
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
True
In [10]:
# Axis-label unit suffixes for later plots.
dir_unit_text = ' (degree)'
# BUG FIX: the original condition was `if 'knot_unit' == True:` -- a string
# literal compared to True, which is always False -- so the speed label was
# unconditionally ' (m/s)'. Test the boolean flag itself.
if knot_unit:
    speed_unit_text = ' (knot)'
else: 
    speed_unit_text = ' (m/s)'

1.2.2 Sampling Type Selection

In [11]:
# Keep only the report type that dominates recent (post-2000) records, so a
# single observation scheme is analysed.
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Types Composition', figsize=(4,3))  # title typo fixed

# Fix: Series.argmax() historically returned the index label but was
# deprecated and is now positional; idxmax() is the stable API for
# "label of the largest count".
report_type_most_used = sample_type.value_counts().idxmax()
df = df.query("type==@report_type_most_used")

1.2.3 Sampling Time Selection

In [12]:
# Midpoint year of the record, used below to split the data when checking
# for drift in the sampling schedule and the distributions.
MID_YEAR = (df.date.min() // 10000 + df.date.max() // 10000) // 2

# Sampling-time frequencies: full record vs the later half, overlaid.
overall_counts = df['HrMn'].value_counts().sort_index()
overall_counts.plot(kind='bar', alpha=0.5, label='Overall')

recent_counts = df.query('date > @MID_YEAR * 10000')['HrMn'].value_counts().sort_index()
recent_counts.plot(kind='bar', alpha=0.5, label='> %s' % MID_YEAR)

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [13]:
# Keep only sampling minutes that are well represented after 2000 (more than
# 2000 records), dropping rare off-schedule observation minutes.
df['sample_time'] = df.HrMn % 100 
sample_time = df.query('date > 20000000')['sample_time']
minute_counts = sample_time.value_counts()  # computed once (was evaluated twice)
sample_times = minute_counts[minute_counts > 2000].index.tolist()
df = df.query("sample_time in @sample_times")
# Fix: drop(labels, 1) positional axis was removed in pandas 2.0 -> columns=
df.drop(columns=['sample_time'], inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[0]
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x18a4b828>

1.3 Data Wrangling

1.3.1 Artefacts

1.3.1.1 wrong direction record

In [14]:
# Integer-style sources should only contain directions that are multiples of
# 10 (or the 999 missing marker); show the offenders, then drop them.
if integer_data:
    bad_dir_rows = df.query("(dir % 10 >= 0.1) & (dir != 999)")
    display(bad_dir_rows)
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type month dir_windrose
time

1.3.1.2 sudden increase in speed

In [15]:
# Sudden-increase detection: `incre` is the speed change from the previous
# record and `incre_reverse` the change relative to the next one; a genuine
# spike is large in both. First records have no neighbour, hence fillna(0).
df['incre'] = df.speed.diff(1).fillna(0)
df['incre_reverse'] = df.speed.diff(-1).fillna(0)

display(df.sort_values(by='speed', ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type month dir_windrose incre incre_reverse
time
1974-12-12 10:00:00 19741212 1000 FM-12 290 55 N 12 160 44.0 46.0
2002-01-28 17:00:00 20020128 1700 FM-12 210 54 N 1 240 27.0 27.0
1976-01-02 23:00:00 19760102 2300 FM-12 180 52 N 1 270 8.0 6.0
2001-12-28 14:00:00 20011228 1400 FM-12 170 51 N 12 280 25.0 30.0
1985-11-13 09:00:00 19851113 900 FM-12 230 50 N 11 220 40.0 45.0
2007-01-18 13:00:00 20070118 1300 FM-12 180 47 N 1 270 17.0 4.0
1976-01-03 00:00:00 19760103 0 FM-12 180 46 N 1 270 -6.0 7.0
1976-05-04 05:00:00 19760504 500 FM-12 220 44 N 5 230 30.0 31.0
1976-01-02 22:00:00 19760102 2200 FM-12 210 44 N 1 240 13.0 -8.0
2002-10-27 10:00:00 20021027 1000 FM-12 200 44 N 10 250 10.0 32.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xe3e8630>
In [16]:
# A record is treated as a spurious spike when the speed jumps by more than
# the threshold relative to BOTH neighbours (20 knots, or 10 m/s).
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
# Fix: drop(labels, 1) positional axis was removed in pandas 2.0 -> columns=
df.drop(columns=['incre', 'incre_reverse'], inplace=True)
sudden increase number 11
date HrMn type dir speed wind_type month dir_windrose incre incre_reverse
time
1976-01-02 23:00:00 19760102 2300 FM-12 180 52 N 1 270 8.0 6.0
2007-01-18 13:00:00 20070118 1300 FM-12 180 47 N 1 270 17.0 4.0
1976-01-03 00:00:00 19760103 0 FM-12 180 46 N 1 270 -6.0 7.0
1976-01-02 22:00:00 19760102 2200 FM-12 210 44 N 1 240 13.0 -8.0
2002-10-27 10:00:00 20021027 1000 FM-12 200 44 N 10 250 10.0 32.0
1990-02-26 07:00:00 19900226 700 FM-12 220 43 N 2 230 20.0 18.0
2007-01-18 15:00:00 20070118 1500 FM-12 180 43 N 1 270 -4.0 3.0
2004-03-20 16:00:00 20040320 1600 FM-12 200 43 N 3 250 3.0 8.0
1977-12-24 09:00:00 19771224 900 FM-12 210 42 N 12 240 18.0 20.0
2003-04-27 12:00:00 20030427 1200 FM-12 200 41 N 4 250 22.0 17.0

1.3.2 0 Speed

In [17]:
# Some stations report an implausible share of calm (zero-speed) records;
# decide from the recent (post-2005) slice and, if flagged, drop zeros.
recent_slice = df.query("(date >= 20050000)")
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(recent_slice)
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.00633458105384

1.3.3 Direction re-alignment and 999

For some datasets, the 16 sectors are not recorded properly,

e.g. the recorded sectors are [0, 20, 30, 50]; the angles need to be redistributed into 22.5-degree sectors.

In [18]:
# Determine how many direction sectors the station actually reports.
display(df['dir'].value_counts().sort_index())
# Count real directions only (999 = missing) and keep sectors with > 30 hits.
# Fix: the original indexed one value_counts() result with a boolean Series
# built from a differently-indexed value_counts(), which raises an
# "unalignable boolean Series" error on modern pandas. Rows with dir == 999
# only contribute to the 999 bucket, so filtering first gives identical
# counts for every real sector.
dir_counts = df.query('dir < 999')['dir'].value_counts()
effective_column = dir_counts[dir_counts > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0       5849
10      4575
20      5878
30      5653
40      6483
50      6282
60      7761
70      6806
80      8109
90      7603
100     7273
110     5527
120     6387
130     5869
140     7875
150     7141
160     8244
170     7583
180     9856
190     8694
200    11517
210    11372
220    16077
230    14449
240    17047
250    14047
260    13836
270    10284
280    10460
290     8782
300     9611
310     7692
320     8203
330     6630
340     6245
350     4914
999     4703
Name: dir, dtype: int64
36 10.0
In [19]:
# df.query('date > 20100000 & dir == 999')['speed'].value_counts()
In [20]:
# Re-bin mis-aligned sectors onto the regular grid, then spread the 999
# (variable-direction) records across sectors (helpers from import_file).
df=realign_direction(df, effective_column)
df=fill_direction_999(df, SECTOR_LENGTH)

1.4 Time Shift Comparison

In [21]:
# Direction histogram bin edges: 'even' centres the first bin on 0 degrees
# (-5 to 5); 'round_up' uses plain [0, 10, ...] edges.
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
early_speed = df.query('date < @MID_YEAR * 10000')['speed']
late_speed = df.query('date > @MID_YEAR * 10000')['speed']
early_speed.plot(kind='hist', alpha=0.5, bins=bins, label='< %s' % MID_YEAR)
late_speed.plot(kind='hist', alpha=0.5, bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [22]:
# Same split-year comparison for the direction distribution.
early_dir = df.query('date < @MID_YEAR * 10000')['dir']
late_dir = df.query('date > @MID_YEAR * 10000')['dir']
early_dir.plot(kind='hist', alpha=0.5, bins=DIR_BIN, label='< %s' % MID_YEAR)
late_dir.plot(kind='hist', alpha=0.5, bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [23]:
# Inspect the time shift of speed and degree distribution, and odd-even bias,
# across multi-year slices (helper from import_file; slice bounds printed below).
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
1973 - 1974
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
1975 - 1979
1980 - 1984
1985 - 1989
1990 - 1994
1995 - 1999
2000 - 2004
2005 - 2009
2010 - 2014
2015 - 2016
In [24]:
# Annual ('A') and monthly ('M') mean speed series; anchoring the y-axis at 0
# makes level shifts between periods visually comparable.
df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
Out[24]:
(0, 16.0)
In [25]:
# Show, then drop, any records whose direction is still missing after the
# 999 redistribution step.
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type month dir_windrose
time
In [26]:
# Per-year histograms (speed, then direction) overlaid on the all-years
# histogram, for years with enough (> 5000) records; shared y-limit so the
# panels are comparable.
# NOTE(review): `normed=` was removed in matplotlib 3.x in favour of
# `density=`; kept as-is for the pinned environment used by this notebook.
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2  # headroom above the tallest all-years bin
    for year in arange(1980, 2016):
        end_year = year
        # date-string slicing on the DatetimeIndex selects one calendar year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 5000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
In [27]:
# Stationarity check: R-squared of each 3-year window's histogram against the
# all-years histogram, per column. Values near 1 mean the distribution is
# stable over time.
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        # centred 3-year window [year-1, year+1], selected by date-string slicing
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 5000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            # R^2 of the window's density against the all-years density
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    # clamp the lower y-limit into [0, 0.85] so small deviations stay visible
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.5 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> 45 ~ 55, to make the KDE result better

In [28]:
# Spread each recorded sector value evenly across its sector width so the KDE
# is not dominated by the discrete reporting grid (helper from import_file).
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [29]:
# Smear integer speeds across a unit interval. When zero-speed records were
# deleted, redistribute downward (v -> [v-1, v]); otherwise upward
# (v -> [v, v+1]) so the remaining zeros stay representable.
if integer_data:
    redistribute_method = 'down' if delete_zero else 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.6 Generate (x,y) from (speed,dir)

In [30]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [31]:
# Decompose (speed, dir) into Cartesian components for the 2-D KDE/GMM fit.
# There might be a small dot in the centre, which is due to too many zeros
# (more than 1 speed) in the centre; also note matplotlib scatter is very
# slow on this many points.
dir_in_radians = df['dir'] * pi / 180.0
df['x'] = df['speed'] * cos(dir_in_radians)
df['y'] = df['speed'] * sin(dir_in_radians)

2. Re-select Data and Overview

2.1 Data Overview

In [32]:
# Summary of the data selection made so far.
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

# Keep the full record for the later across-year comparison and restrict the
# working frame to the 2010-2014 window.
df_all_years = df
df = df_all_years.query('(date >= 20100000) & (date < 20150000)')
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? True
Report type used: FM-12
Sampling time used: [0]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[32]:
date HrMn dir speed month dir_windrose x y
count 4.362700e+04 43627.000000 43627.000000 43627.000000 43627.000000 43627.000000 43627.000000 43627.000000
mean 2.012063e+07 1149.480826 191.385116 9.426368 6.518395 185.404153 -1.739983 -2.091133
std 1.413450e+04 692.216184 95.093195 5.027427 3.449679 111.541762 6.975455 7.620759
min 2.010010e+07 0.000000 -4.988284 0.001616 1.000000 0.000000 -33.190758 -34.151522
25% 2.011040e+07 500.000000 111.487449 5.670094 4.000000 120.000000 -6.500068 -7.215563
50% 2.012063e+07 1100.000000 212.863971 8.683844 7.000000 190.000000 -1.182837 -2.356030
75% 2.013093e+07 1700.000000 264.155041 12.374514 10.000000 250.000000 3.236998 3.466685
max 2.014123e+07 2300.000000 354.944562 36.614390 12.000000 999.000000 21.135344 27.498769
In [33]:
df.plot(y='speed',legend=True,figsize=(20,5))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x190d7470>
In [34]:
# Monthly record counts for the selected window (gaps reveal missing periods).
# df['date'].apply(lambda x: str(x)[:-2]).value_counts().sort_index().plot(kind='bar', figsize=(20,4))
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x18ab3c50>
In [35]:
# Wind rose of the selected window, using the original windrose bearings
# (90 degree is east), 36 sectors, viridis-coloured speed classes.
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [36]:
# Guard against huge selections: KDE and scatter get very slow beyond ~1M
# rows, so fall back to a 500k bootstrap resample, comparing the direction
# histograms before/after to confirm the resample is representative.
# NOTE(review): the resample draws from df_all_years rather than the current
# 2010-2014 window -- confirm that is intended.
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    # Fix: legend label typo 'resmapled' -> 'resampled'
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resampled')
    plt_configure(legend=True, figsize=(20,4))
In [37]:
# Fit a Weibull distribution to the scalar speeds and compare with the ECDF
# (helper from import_file).
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison on Weibull axes: ln(-ln(1-P)) vs ln(V) is a straight line
# for a Weibull distribution. Points with P == 0 or P == 1 produce log(0)
# (the RuntimeWarnings seen in the output); presumably they just render as
# gaps -- TODO confirm.
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [38]:
# Raw (x, y) scatter of the samples; equal aspect so directions are not skewed.
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2. Overview by Direction

In [39]:
# Bin width for the per-direction overview: stations reporting 16 sectors
# keep their native 22.5-degree width, everything else uses 10-degree bins.
rebinned_angle = 22.5 if len(effective_column) == 16 else 10
In [40]:
# Per-sector speed histograms: sweep the circle in `incre`-degree windows,
# sharing the axis limits so all sectors are visually comparable.
original_incre, incre = SECTOR_LENGTH, rebinned_angle
# offset the first window so the new bins stay centred on the old sectors
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

2.3 Overview by Month

In [41]:
# Monthly wind roses (speeds >= 1 only) to inspect seasonal direction shifts;
# month_incre > 1 groups several months into one rose.
month_incre = 1
current_df = df.query('speed>=1')
for month in arange(1, 12+month_incre, month_incre): 
    end_month = month+month_incre
    sub_df = current_df.query('(month >= @month) and (month < @end_month)')
    if len(sub_df) > 0:
        if month_incre == 1:
            title = 'Month: %s' % (month)
        else:
            title = 'Month: %s - %s ' % (month, end_month-1)
        ax = WindroseAxes.from_ax()
        ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
        plt_configure(figsize=(3,3), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

3. Create input data and configuration

In [42]:
# Model inputs and fitting configuration.
SPEED_SET = array(list(zip(df.x, df.y)))  # (n, 2) array of Cartesian samples
NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
# NOTE(review): the constant name is misspelled ("BANDWDITH") but is
# referenced later in the notebook, so it is kept for compatibility.
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
In [43]:
# Square grid of integer (x, y) evaluation points covering the central 95%
# of observed speeds; used as the fitting grid for the KDE/GMM comparison.
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

FITTING_RANGE = [[i, j] for i in fitting_axis_range for j in fitting_axis_range]
[-19 -18 -17 -16 -15 -14 -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2
  -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16
  17  18  19]
In [44]:
# Plotting axis range, also clipped at the 95th speed percentile.
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)

4. Kernel Density Estimation

In [45]:
# KDE input sample and kernel choice.
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [46]:
%%time
# Cross-validated bandwidth selection for the KDE (4-fold, all cores).
# NOTE(review): sklearn.grid_search was deprecated in 0.18 and removed in
# 0.20; kept deliberately because model_selection's variant was too slow here.
from sklearn.grid_search import GridSearchCV
# from sklearn.model_selection import GridSearchCV  ## too slow

# The bandwidth value sometimes would be too radical
if knot_unit:
    bandwidth_range = arange(0.7,2,0.2)
else:
    bandwidth_range = arange(0.4,1,0.1)

# Grid search is unable to deal with too many data (a long time is needed),
# so search on a 50k subsample when the dataset is larger.
# NOTE(review): replace=True duplicates points, which could bias the CV score
# toward smaller bandwidths -- confirm intended.
if len(sample) > 50000:    
    df_resample=df.sample(n=50000, replace=True)
    bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
else:
    bandwidth_search_sample = sample

grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

grid.fit(bandwidth_search_sample)
bandwidth = grid.best_params_['bandwidth']
print(bandwidth)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)
1.7
Wall time: 2min 9s
In [47]:
# Fit the KDE with the selected (or default) bandwidth and evaluate it on the
# fitting lattice; kde_result is the reference density for all later GoF tests.
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH

kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 1.7 1521
[  4.90002298e-05   6.49638686e-05   8.50114797e-05   1.07300827e-04
   1.26283991e-04]
In [48]:
# Plot jPDF: KDE surface in 3-D, its 2-D density map, and the raw 2-D histogram
# for visual comparison. kde_Z and colorbar_lim are reused by later cells.
X = Y = PLOT_AXIS_RANGE
# Can't work if pass as generate_Z_from_X_Y(X,Y, exp(kde.score_samples())), need to use lambda
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
# Shared color scale so the KDE and GMM density maps are directly comparable.
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(4,3))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

# Raw 2-D histogram of the observations, same axes, grid disabled for legibility.
with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(4,3))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()
In [49]:
# Cumulative distribution derived from the KDE density over FITTING_RANGE.
kde_cdf = cdf_from_pdf(kde_result)

5. GMM by Expectation-maximization

In [50]:
# Fit a full-covariance Gaussian mixture by expectation-maximization.
sample= SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
# Confirm EM converged before using clf downstream.
print(clf.converged_)
True
In [51]:
# Extract the EM solution into the flat parameter format used by later cells
# (weight, mean_x, mean_y, sig_x, sig_y, corr per component) and display it.
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[51]:
weight mean_x mean_y sig_x sig_y corr
1 0.354 1.616 -4.389 4.870 4.824 0.042
2 0.332 0.346 4.997 6.116 5.123 0.021
3 0.314 -7.724 -6.994 5.961 6.915 -0.126
In [52]:
# Visualize the EM-fitted components as covariance ellipses.
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.353783472943 [[ 1.61639683 -4.38920549]] [ 4.7423114   4.94904038] -51.3692407661
0.332027505201 [[ 0.34644089  4.9967924 ]] [ 5.11960475  6.11966026] -86.592369734
0.314189021856 [[-7.72421961 -6.99380757]] [ 5.79909349  7.05148427] -159.898772356
In [53]:
# Compare the EM-fitted GMM surface against the KDE surface: 3-D densities,
# the residual surface, and side-by-side 2-D density maps on a shared scale.
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

def kde_minus_em_gmm(points):
    """Pointwise difference: KDE density minus EM-fitted GMM density."""
    return exp(kde.score_samples(points)) - exp(clf.score_samples(points))

residual_Z = generate_Z_from_X_Y(X,Y, kde_minus_em_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(4,3))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(4,3))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(4,3))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics

In [54]:
# Goodness-of-fit of the EM GMM against the KDE reference on the fitting grid.
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[54]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.018 0.020 1.082357e-08 0.030 0.162

6. GMM by Optimization

In [55]:
# Inputs for the optimization-based GMM fit (section 6).
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
In [56]:
# Refine the EM solution by constrained minimization of the fit score
# against the KDE reference density.
# GMM parameter layout, repeated per component: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result

cons = [
        # sum of every 6th element (the component weights) must equal 1
        {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
        # optional: limit the width/height ratio of each ellipse
#         {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
#         {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

# Per-component box bounds: weight, mean_x, mean_y, sig_x, sig_y, rho.
component_bounds = [(0., 0.99), (-fit_limit, fit_limit), (-fit_limit, fit_limit),
                    (0., fit_limit), (0., fit_limit), (-0.99, 0.99)]
bonds = component_bounds * (len(x0)//6)

result = sp.optimize.minimize(
    lambda params: GMM_fit_score(params, kde_result, points, FIT_METHOD),
    x0,
    bounds = bonds,
    constraints=cons,
    tol = 0.000000000001,
    options = {"maxiter": 500})
result
Out[56]:
     fun: -19.032341559314645
     jac: array([  9.81726885e-01,   0.00000000e+00,   4.76837158e-07,
        -2.38418579e-07,   2.38418579e-07,  -1.90734863e-06,
         9.81730461e-01,   0.00000000e+00,  -2.38418579e-07,
        -2.38418579e-07,  -2.38418579e-07,   2.62260437e-06,
         9.81728554e-01,   2.38418579e-07,   4.76837158e-07,
         2.38418579e-07,   0.00000000e+00,  -9.53674316e-07,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 1074
     nit: 53
    njev: 53
  status: 0
 success: True
       x: array([  5.14446968e-01,  -3.35228633e+00,  -6.70615733e+00,
         7.79396261e+00,   5.94852991e+00,   9.12630910e-02,
         3.75103979e-01,  -1.20133003e-05,   4.62986400e+00,
         7.08143310e+00,   4.78652622e+00,   1.21798194e-01,
         1.10449053e-01,   1.72489470e+00,  -3.74538237e+00,
         3.88683506e+00,   2.96670349e+00,   3.58765405e-01])

6.1 GMM Result

In [57]:
# Regroup the flat optimizer solution into per-component parameters (sorted),
# build the mixture pdf callable, and evaluate it on the fitting grid.
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[57]:
weight mean_x mean_y sig_x sig_y corr
1 0.514 -3.352 -6.706 7.794 5.949 0.091
2 0.375 -0.000 4.630 7.081 4.787 0.122
3 0.110 1.725 -3.745 3.887 2.967 0.359
In [58]:
# Covariance ellipses of the optimization-fitted mixture.
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.514446967928 [[-3.35228633 -6.70615733]] [ 5.89047564  7.83793072] -80.7736034099
0.375103979271 [[ -1.20133003e-05   4.62986400e+00]] [ 4.72216089  7.12451574] -81.5674081612
0.110449052801 [[ 1.7248947  -3.74538237]] [ 2.59862288  4.14197724] -63.6568489125

6.2 Goodness-of-fit statistics

In [59]:
# Goodness-of-fit of the optimization-fitted GMM against the KDE reference.
gof_df(gmm_pdf_result, kde_result)
Out[59]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.014 0.024 5.424492e-09 0.021 0.115
In [60]:
# Compare the optimization-fitted GMM surface against the KDE surface.
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument

# FIX: renamed from "residule_between_kde_and_gmm" — that (misspelled) name
# duplicated the helper defined in the EM-comparison cell and silently
# shadowed it; a distinct name makes clear this one uses mixed_model_pdf.
def residual_between_kde_and_opt_gmm(points):
    """KDE density minus the optimization-fitted GMM density at `points`."""
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = mixed_model_pdf(points)
    return kde_vals - gmm_vals

residual_Z = generate_Z_from_X_Y(X,Y, residual_between_kde_and_opt_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(4,3))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(4,3))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(4,3))
plot_2d_prob_density(X,Y,residual_Z,  xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()
In [61]:
def f(V,theta):
    # Joint pdf in polar coordinates: p(V, theta) = p(V cos(theta), V sin(theta)) * V,
    # where the factor V is the Jacobian of the polar-to-Cartesian transform.
    # Used by all the integrate.nquad calls below.
    return (mixed_model_pdf([[V*cos(theta),V*sin(theta)]]))*V
In [62]:
# Fit a Weibull to the speed magnitudes and get the empirical CDF for comparison.
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 3. GMM distribution: CDF of speed obtained by integrating the polar joint pdf
# over all directions and speeds up to x_val.
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

# Weibull plot (ln V vs ln(-ln(1-P))): a Weibull CDF appears as a straight line.
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'})
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:7: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:8: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:9: RuntimeWarning: divide by zero encountered in log
In [63]:
# Compare the marginal speed distribution: data histogram vs GMM vs Weibull.
# 1. GMM model: probability mass in a narrow speed band [x-0.01, x+0.01]
#    over all directions, divided by the band width to get a density.
x = arange(0, max_speed, 0.5)
band_masses = [integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*band_masses))[0]) * len(df.speed) / 0.02

# 2. Weibull pdf from the previously fitted parameters
y_weibull_pdf = sp.stats.weibull_min.pdf(x, *weibull_params)

df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm, '-', color='black', label='GMM')
plot(x, y_weibull_pdf*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency', legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)
Speed Distribution Comparison
In [64]:
# Calculate Angle Distribution: compare the marginal direction distribution
# from the GMM (integrated over all speeds, in pi/18-wide angular bins)
# against the data histogram.
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])*len(df['dir']) 

df['dir'].hist(bins=DIR_BIN, alpha=0.5, label='Data')
# Convert radians back to degrees for the shared x axis.
plot(x/pi*180, y,'-', color='black', label='GMM')
title='Direction Distribution Comparison'
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
# Keep a handle to this figure; it is re-displayed later for comparison.
dir_fig = plt.gcf()
print(title)
Direction Distribution Comparison
In [65]:
# %%time
# Sector-by-sector comparison of model vs data, parallelized across sectors.
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in the paper:
# "Comparison of bivariate distribution construction approaches for analysing
# wind speed and direction data"
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.905218444225

6.3 Sectoral Comparison

In [66]:
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    start, end = -original_incre/2 + incre/2, 360
    max_diff_array = []
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = radians(angle), radians(incre)  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. Make Plots
        fig = plt.figure(figsize=(10,1.9))
#         fig = plt.figure(figsize=(10,1.7))
        # 3.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
#         plt_configure(xlabel = "$V$", ylabel='Frequency', legend=True)
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 3.2. CDF Comaprison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
#         plt_configure(xlabel = "$V$", ylabel='$P$', legend=True)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        # 3.3. Weibull Comparison
#         ax3 = fig.add_subplot(1,3,3)
#         plot(log(x), log(-log(1-y_ecdf)),'o', alpha=0.8, label='Data')
#         plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label='GMM')
#         plot(log(x), log(-log(1-y_cdf_weibull)),'--',color='black',label='Weibull')
#         plt.gca().set_xlim(right = log(max_speed+1))
# #         plt_configure(xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
#         plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})
        
        bins = arange(0, sub_df.speed.max()+1)
        density, _ = np.histogram(sub_df['speed'], bins=bins, normed=True)
        density_expected_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        diff, diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
        max_diff_array.append([len(sub_df), angle, diff.max(), x[diff.argmax()], 
                               diff_weibull.max(), x[diff_weibull.argmax()], R_square_gmm, R_square_weibull])
        curves = {'angle': angle, 'data_size': data_size, 'weight': direction_prob, 
                  'x': x, 'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', diff.max(), diff_weibull.max(), 'speed value:', x[diff.argmax()], x[diff_weibull.argmax()], 'y gmm', y_cdf_gmm[diff.argmax()])
        print(' ')
    return max_diff_array, curve_collection
In [67]:
%%time
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
max_diff_array, curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 1058 weight 0.024251037201732873
GMM Weibull
R square 0.909002856793 0.964700673847
max diff: 0.0934343796501 0.0162674458321 speed value: 11.0379330073 8.83034640581 y gmm 0.753446527722
 
25.0 (15.0 - 35.0) degree
data size: 1789 weight 0.04100671602448025
GMM Weibull
R square 0.841421563997 0.978437794225
max diff: 0.121619394942 0.0145498331704 speed value: 10.689295119 8.55143609522 y gmm 0.648084350167
 
45.0 (35.0 - 55.0) degree
data size: 2002 weight 0.0458890136841864
GMM Weibull
R square 0.894586796112 0.977092188141
max diff: 0.0931412504816 0.0287184588738 speed value: 11.4476302075 9.15810416597 y gmm 0.699566042226
 
65.0 (55.0 - 75.0) degree
data size: 2170 weight 0.049739840007334904
GMM Weibull
R square 0.94265434108 0.953865918473
max diff: 0.0334882918899 0.0304701626572 speed value: 6.06346855058 6.06346855058 y gmm 0.293239818709
 
85.0 (75.0 - 95.0) degree
data size: 2057 weight 0.0471497008733124
GMM Weibull
R square 0.871622073359 0.904812217113
max diff: 0.0618918676421 0.0477465367133 speed value: 12.2798175592 6.82212086622 y gmm 0.852363428167
 
105.0 (95.0 - 115.0) degree
data size: 1777 weight 0.04073165700139822
GMM Weibull
R square 0.921870442374 0.914795666786
max diff: 0.048596114644 0.038593133469 speed value: 14.5347143057 7.26735715283 y gmm 0.946739052179
 
125.0 (115.0 - 135.0) degree
data size: 1540 weight 0.035299241295528
GMM Weibull
R square 0.902621581879 0.927609571121
max diff: 0.0831293978299 0.0267907758396 speed value: 12.2565588537 8.17103923581 y gmm 0.840921605622
 
145.0 (135.0 - 155.0) degree
data size: 2037 weight 0.04669126916817567
GMM Weibull
R square 0.925436795204 0.956342353082
max diff: 0.0552855866893 0.0544679565598 speed value: 12.1398979518 8.09326530117 y gmm 0.789584320036
 
165.0 (155.0 - 175.0) degree
data size: 2028 weight 0.046484974900864146
GMM Weibull
R square 0.936360140698 0.950426673143
max diff: 0.0480750669377 0.0416460029833 speed value: 4.94421188545 9.88842377089 y gmm 0.190579997904
 
185.0 (175.0 - 195.0) degree
data size: 2424 weight 0.055561922662571345
GMM Weibull
R square 0.931196202515 0.958462585875
max diff: 0.0416544648859 0.0527172225449 speed value: 6.50994236719 11.3923991426 y gmm 0.260713870826
 
205.0 (195.0 - 215.0) degree
data size: 2915 weight 0.06681642102367799
GMM Weibull
R square 0.954782769695 0.977037934997
max diff: 0.0482681190537 0.0381609035626 speed value: 13.4895121854 13.4895121854 y gmm 0.623086940981
 
225.0 (215.0 - 235.0) degree
data size: 4440 weight 0.10177183854035345
GMM Weibull
R square 0.965891998456 0.97309362379
max diff: 0.0209518859045 0.0281900349898 speed value: 7.14908435793 16.0854398053 y gmm 0.227933867887
 
245.0 (235.0 - 255.0) degree
data size: 4422 weight 0.1013592500057304
GMM Weibull
R square 0.977158823595 0.978855133426
max diff: 0.0215633692443 0.0265792618111 speed value: 9.27264317037 9.27264317037 y gmm 0.418735138218
 
265.0 (255.0 - 275.0) degree
data size: 3553 weight 0.0814403924175396
GMM Weibull
R square 0.93665820165 0.959848216654
max diff: 0.0545453873177 0.017966549793 speed value: 7.31762939866 1.82940734966 y gmm 0.349057202043
 
285.0 (275.0 - 295.0) degree
data size: 2924 weight 0.06702271529098952
GMM Weibull
R square 0.94003367173 0.952003102425
max diff: 0.0542056218746 0.0254426081528 speed value: 6.3784665038 6.3784665038 y gmm 0.331225294678
 
305.0 (295.0 - 315.0) degree
data size: 2550 weight 0.05845004240493273
GMM Weibull
R square 0.940618402175 0.958148835857
max diff: 0.0551971659022 0.0192303477169 speed value: 11.8613767814 2.63586150699 y gmm 0.777351853706
 
325.0 (315.0 - 335.0) degree
data size: 2112 weight 0.0484103880624384
GMM Weibull
R square 0.962935248997 0.964430167851
max diff: 0.0476157282566 0.024615497399 speed value: 5.93384020427 8.30737628597 y gmm 0.361948665683
 
345.0 (335.0 - 355.0) degree
data size: 1494 weight 0.03424484837371353
GMM Weibull
R square 0.932490874235 0.953487124535
max diff: 0.0689557379042 0.0997999099075 speed value: 12.060822946 5.4821922482 y gmm 0.842021504398
 
Wall time: 59.4 s
In [68]:
# Tabulate the per-sector summaries and plot GMM vs Weibull R-square by direction.
diff_df = pd.DataFrame(max_diff_array,columns=['datasize','direction', 'gmm', 'speed_gmm',
                                               'weibull', 'speed_weibull', 'r_square_gmm', 'r_square_weibull'])  

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
# Clamp the lower limit so small fluctuations near 1 stay readable.
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.9346009046583896 0.9587689582481885
In [69]:
# Plot the per-sector maximum CDF difference (K-S statistic) by direction.
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.gmm, diff_df.weibull, diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
# Keep at least [0, 0.25] visible so sectors can be compared across datasets.
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.05206628959714735 0.03327163240483782
In [70]:
# Compare direction weight with previous figure (re-display the direction
# distribution figure saved earlier as dir_fig).
display(dir_fig)

6.4 Insufficient-fit Sector Investigation

6.4.1 Data Variability, by Bootstrap (Resampling)

In [71]:
# Pick the sector with the largest GMM K-S difference (column index 2 of each
# summary row) for the closer investigation that follows.
max_diff_element = max(max_diff_array, key=lambda x: x[2])
angle =  max_diff_angle = max_diff_element[1]  # column 1 is the sector's centre angle
incre = rebinned_angle
In [72]:
# Bootstrap fraction: resample the full sector each time (frac=1, with replacement).
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [73]:
# Bootstrap the worst-fit sector to see whether the model-data discrepancy is
# within the data's own sampling variability.
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weibull
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled: 99 bootstrap resamples (with replacement) of the sector;
# each resample's ECDF is overlaid to show the empirical variability band.
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, normed=True) 
    count_collection.append(resampled_count)
    
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    # Configure legends only once to avoid 99 duplicate legend entries.
    if i == 1: 
#         plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
#         plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
# Envelope (min/max per bin) of the resampled histograms.
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
# NOTE(review): y_ecdf here is the LAST bootstrap resample's ECDF, so this
# reported max diff is against a resample, not the original data.
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
25.0 (15.0 - 35.0) Degree Speed Distribution
0.133408102215 10.0 0.599962495885

6.4.2 Time Variability

In [74]:
# Time variability of the worst-fit sector: compare the fitted GMM/Weibull
# curves against the sector's empirical distribution in successive 5-year windows.
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
# BUG FIX: 'axes.color_cycle' is deprecated (it emitted a UserWarning in the
# output); read the color list from 'axes.prop_cycle' instead.
prop_cycle = iter(mpl.rcParams['axes.prop_cycle'].by_key()['color'])
# Dates are encoded as YYYYMMDD integers; step through 5-year windows.
for start_time in range(20000000, 20150000, 50000):
    end_time = start_time + 50000 
    time_label = start_time//10000  # starting year of the window
    df_other_years = df_all_years.query('(date >= @start_time) & (date < @end_time)')
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = time_label)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = time_label)
        
        title = '%s - %s' %(time_label, time_label+4)
        count, division = np.histogram(df_other_years_at_angle['speed'], normed=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=time_label, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        # Overlay the fitted curves at each time slice; label only once (2010).
        x_3d = time_label*np.ones_like(x)
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if time_label == 2010 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if time_label == 2010 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
# plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
# plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)", legend={'loc':'best'})
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:25: RuntimeWarning: divide by zero encountered in log
25.0 (15.0 - 35.0) Degree Speed Distribution

6.4.3 Adjacent Sector Variability

In [75]:
# The worst-fit sector plus its two immediate neighbours.
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]
In [76]:
# Adjacent-sector variability: plot the worst-fit sector and its neighbours
# side by side (3-D histogram + fitted curves, and per-sector PDF overlays).
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.gca(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
# BUG FIX: 'axes.color_cycle' is deprecated (it emitted a UserWarning in the
# output); read the color list from 'axes.prop_cycle' instead.
prop_cycle = iter(mpl.rcParams['axes.prop_cycle'].by_key()['color'])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    # Wrap out-of-range neighbour angles into [0, 360) and fetch that sector's curves.
    curves = curve_df.query('angle == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['data_size'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    # Solid line highlights the worst-fit sector; dotted for its neighbours.
    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    # Only add the 3-D legend once, on the first sector.
    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
# Use a common y scale for the two 2-D panels.
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
25.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation

In [77]:
# Collect the fitting configuration for the cross-validation runs, falling
# back to defaults for any setting not already defined in this session.
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}
# BUG FIX: this previously overwrote the unit label unconditionally with
# ' (knot)', mislabelling axes for datasets recorded in other units; follow
# the same guard pattern as the settings above and only set it when undefined.
if 'speed_unit_text' not in globals():
    speed_unit_text = ' (knot)'

7.1 Variability of the Result

In [78]:
%%time
# Re-fit the GMM on 10 bootstrap resamples of the data (in parallel) to gauge
# the variability of the fitted parameters and the goodness-of-fit.
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(10))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    # GoF against the resample's own KDE, then against the original full-data KDE.
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.806 -2.670 -2.600 7.540 8.462 0.174
2 0.112 1.683 -4.222 3.955 3.129 0.342
3 0.082 3.023 5.058 4.500 3.037 -0.044
GMM Plot Result
0.806167686376 [[-2.66995508 -2.60007   ]] [ 7.13470405  8.80644101] 151.808581054
0.112285230399 [[ 1.68275563 -4.22191533]] [ 2.75248165  4.22594311] -62.3359021811
0.0815470832249 [[ 3.02301287  5.05760789]] [ 3.03143628  4.50373745] -93.0936592904
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.012 0.011 5.451089e-09 0.022 0.115
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.012 0.010 5.400547e-09 0.021 0.115

weight mean_x mean_y sig_x sig_y corr
1 0.793 -2.793 -2.756 7.559 8.345 0.157
2 0.121 1.657 -4.062 4.221 3.251 0.446
3 0.085 2.887 5.335 4.563 3.061 -0.003
GMM Plot Result
0.793387076865 [[-2.79260505 -2.75600085]] [ 7.18845852  8.66630027] 151.130709775
0.121309403394 [[ 1.65736508 -4.06223629]] [ 2.65983487  4.61635767] -60.3053935383
0.0853035197409 [[ 2.88684922  5.33489007]] [ 3.0607359   4.56252413] -90.1832090587
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.011 0.011 5.459621e-09 0.022 0.115
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.010 0.011 5.414990e-09 0.021 0.115

weight mean_x mean_y sig_x sig_y corr
1 0.511 -3.601 -6.731 7.690 6.076 0.080
2 0.384 0.193 4.470 7.017 4.889 0.067
3 0.106 1.723 -3.859 3.801 2.821 0.273
GMM Plot Result
0.510535961612 [[-3.60099746 -6.73089443]] [ 6.02562533  7.72949068] -80.7045502788
0.383733761643 [[ 0.19308916  4.46968198]] [ 4.86749967  7.03184803] -84.8769496593
0.105730276745 [[ 1.72327271 -3.85892544]] [ 2.61394789  3.94683753] -68.9616043215
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.013 0.021 5.640042e-09 0.022 0.117
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.013 0.020 5.594148e-09 0.022 0.117

weight mean_x mean_y sig_x sig_y corr
1 0.549 -3.444 -6.154 7.784 6.322 0.077
2 0.334 0.554 5.006 6.719 4.600 0.071
3 0.117 1.707 -3.760 4.011 3.064 0.350
GMM Plot Result
0.549169436369 [[-3.44377357 -6.15442316]] [ 6.26894924  7.82734557] -79.9246991753
0.334142834851 [[ 0.55389557  5.00554872]] [ 4.57791455  6.73393979] -84.8010202688
0.116687728779 [[ 1.7071831  -3.75985956]] [ 2.69951144  4.26478529] -63.9788143365
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.991 0.013 0.025 5.207476e-09 0.021 0.113
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.014 0.025 5.739000e-09 0.022 0.118

weight mean_x mean_y sig_x sig_y corr
1 0.813 -2.586 -2.601 7.644 8.439 0.172
2 0.113 1.753 -3.967 3.935 3.226 0.399
3 0.074 2.689 5.239 4.550 2.806 -0.019
GMM Plot Result
0.813143210508 [[-2.58598934 -2.60097206]] [ 7.21081881  8.81153014] 149.957216885
0.112536633338 [[ 1.75317948 -3.96668879]] [ 2.698145    4.31398826] -58.2961543737
0.074320156154 [[ 2.68898217  5.23922235]] [ 2.80549673  4.55004306] -91.0993987937
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.014 0.011 5.698907e-09 0.022 0.118
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.014 0.011 5.356152e-09 0.021 0.114

weight mean_x mean_y sig_x sig_y corr
1 0.519 -3.170 -6.696 7.879 5.878 0.108
2 0.377 -0.014 4.650 7.164 4.875 0.127
3 0.104 1.673 -3.628 3.695 2.931 0.362
GMM Plot Result
0.518698214497 [[-3.16961563 -6.69600098]] [ 5.80315888  7.93445823] -80.0333105721
0.377342460837 [[-0.01351989  4.65041931]] [ 4.80308248  7.21216995] -81.092392377
0.103959324666 [[ 1.67347475 -3.62801379]] [ 2.54036927  3.97304098] -61.4218778158
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.014 0.025 5.384354e-09 0.021 0.115
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.012 0.023 5.569039e-09 0.022 0.117

weight mean_x mean_y sig_x sig_y corr
1 0.496 -3.160 -6.969 7.842 5.814 0.122
2 0.389 -0.615 4.533 7.299 4.795 0.174
3 0.115 1.769 -3.538 3.922 3.123 0.399
GMM Plot Result
0.496023278137 [[-3.15999834 -6.96881766]] [ 5.72084953  7.9105805 ] -79.0618188243
0.388818443078 [[-0.61505333  4.53253897]] [ 4.67139819  7.37944315] -79.0625034957
0.115158278786 [[ 1.76877562 -3.53827543]] [ 2.63220057  4.26686819] -59.9748319404
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.015 0.029 5.208728e-09 0.021 0.113
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.014 0.026 5.675271e-09 0.022 0.118

weight mean_x mean_y sig_x sig_y corr
1 0.489 -3.364 -7.181 7.700 5.746 0.121
2 0.412 -0.123 4.272 7.206 4.952 0.121
3 0.099 1.664 -3.850 3.626 2.666 0.300
GMM Plot Result
0.489117797091 [[-3.36422302 -7.18091698]] [ 5.65415722  7.76742686] -78.9113697029
0.412129603492 [[-0.12299329  4.27234735]] [ 4.88446273  7.25247981] -81.2302815396
0.0987525994162 [[ 1.6640779  -3.85018714]] [ 2.43808386  3.78363292] -68.0951523841
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.013 0.019 5.265044e-09 0.021 0.113
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.014 0.022 5.678930e-09 0.022 0.118

weight mean_x mean_y sig_x sig_y corr
1 0.500 -3.300 -6.851 7.899 5.802 0.104
2 0.382 -0.216 4.592 7.260 4.783 0.171
3 0.118 1.671 -3.705 4.033 3.094 0.418
GMM Plot Result
0.500238249146 [[-3.29969093 -6.85098243]] [ 5.73453478  7.9483782 ] -80.7902175357
0.381553982503 [[-0.21592882  4.59165241]] [ 4.66220294  7.33762152] -79.1557030841
0.118207768351 [[ 1.67056506 -3.70529323]] [ 2.59277725  4.37206034] -61.3441178272
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.989 0.015 0.027 5.791423e-09 0.022 0.119
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.014 0.028 5.568005e-09 0.022 0.117

weight mean_x mean_y sig_x sig_y corr
1 0.458 -3.029 -7.284 7.876 5.608 0.144
2 0.419 -0.729 4.262 7.398 5.052 0.220
3 0.123 1.644 -3.628 4.212 3.191 0.448
GMM Plot Result
0.458363831984 [[-3.0287328  -7.28412872]] [ 5.4943295   7.95547522] -78.7247796681
0.418841250567 [[-0.72874054  4.26169178]] [ 4.83407513  7.54275652] -75.3055806761
0.122794917449 [[ 1.64435029 -3.6282872 ]] [ 2.61737787  4.59023959] -61.0452877792
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.013 0.023 5.493028e-09 0.022 0.116
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.990 0.012 0.024 5.706451e-09 0.022 0.118
Wall time: 29.8 s

7.2 Cross-validation, to select the number of Gaussian

In [79]:
%%time
from sklearn.cross_validation import train_test_split, KFold

## 5-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
Number of train/test dataset 32720.25 10906.75
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.027866 0.040965 2.957004e-08 0.049296 0.268495 0.946636
1 0.028546 0.041356 3.060776e-08 0.050956 0.273103 0.944188
2 0.027115 0.039799 2.791368e-08 0.049199 0.261019 0.949352
3 0.027582 0.039899 2.947915e-08 0.049601 0.267930 0.947295
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.029047 0.040563 3.002430e-08 0.052072 0.270493 0.945537
1 0.027625 0.038148 2.605420e-08 0.046397 0.252147 0.954202
2 0.031890 0.044707 3.502702e-08 0.052112 0.291650 0.937537
3 0.034181 0.038849 3.105764e-08 0.051505 0.275575 0.942077
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.022432 0.028435 1.057536e-08 0.029884 0.160525 0.980851
1 0.022492 0.029110 1.090282e-08 0.030273 0.162969 0.980284
2 0.021568 0.028171 1.048385e-08 0.029640 0.159956 0.981133
3 0.021401 0.027779 1.043366e-08 0.029743 0.159476 0.981100
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.026921 0.030932 1.209698e-08 0.031828 0.171832 0.978290
1 0.024234 0.031707 1.133273e-08 0.031023 0.166383 0.979587
2 0.024143 0.024496 1.243758e-08 0.032578 0.173819 0.977261
3 0.024963 0.028109 1.266909e-08 0.032377 0.175747 0.977294
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.025132 0.014906 5.302479e-09 0.021190 0.113764 0.990390
1 0.024060 0.013486 5.213517e-09 0.021048 0.112707 0.990553
2 0.023014 0.013211 5.739184e-09 0.021975 0.118159 0.989673
3 0.024292 0.013740 5.712278e-09 0.021813 0.118075 0.989681
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.022582 0.011994 6.941909e-09 0.023943 0.129836 0.987565
1 0.028041 0.014539 7.531232e-09 0.024882 0.135588 0.986507
2 0.025480 0.015091 6.984588e-09 0.024314 0.130886 0.987239
3 0.034689 0.017976 6.632601e-09 0.024063 0.126921 0.988014
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.017072 0.012455 2.082419e-09 0.013333 0.071245 0.996241
1 0.008996 0.015646 2.935405e-09 0.015701 0.084570 0.994692
2 0.010405 0.015389 2.967809e-09 0.015632 0.085065 0.994623
3 0.009665 0.015371 3.050258e-09 0.016145 0.086245 0.994492
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.025153 0.016014 5.204072e-09 0.020545 0.112646 0.990569
1 0.010799 0.015713 3.496224e-09 0.017255 0.092384 0.993690
2 0.011725 0.015663 3.843168e-09 0.018575 0.096761 0.993109
3 0.013316 0.014271 3.896271e-09 0.017459 0.097404 0.992946
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.012789 0.007293 1.435976e-09 0.010926 0.059151 0.997415
1 0.007714 0.006536 1.840280e-09 0.012381 0.067013 0.996673
2 0.004961 0.005649 1.304434e-09 0.010534 0.056386 0.997637
3 0.008136 0.014338 1.743229e-09 0.012217 0.065172 0.996844
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.017697 0.008672 4.282890e-09 0.019365 0.102246 0.992186
1 0.010749 0.008033 3.237649e-09 0.016810 0.088699 0.994164
2 0.009239 0.012637 4.254209e-09 0.018672 0.101855 0.992374
3 0.015137 0.015012 2.902275e-09 0.015209 0.084172 0.994785
Wall time: 1min 14s
In [80]:
# Summarise the per-fold goodness-of-fit tables into mean/std frames,
# one row per candidate number of Gaussians; these feed the learning
# curves plotted in the next cell.
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)

test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.027777 0.040505 2.939266e-08 0.049763 0.267637 0.946867
2 0.021973 0.028374 1.059892e-08 0.029885 0.160732 0.980842
3 0.024125 0.013836 5.491864e-09 0.021507 0.115676 0.990074
4 0.011534 0.014715 2.758973e-09 0.015203 0.081781 0.995012
5 0.008400 0.008454 1.580980e-09 0.011515 0.061930 0.997142
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.030686 0.040567 3.054079e-08 0.050521 0.272466 0.944838
2 0.025065 0.028811 1.213409e-08 0.031952 0.171945 0.978108
3 0.027698 0.014900 7.022582e-09 0.024301 0.130807 0.987331
4 0.015248 0.015415 4.109934e-09 0.018458 0.099799 0.992579
5 0.013206 0.011089 3.669256e-09 0.017514 0.094243 0.993377
In [81]:
# Plot train/test goodness-of-fit vs. number of Gaussians, with +/- 1 std
# bands. 'axes.color_cycle' was deprecated in matplotlib 1.5 and removed
# in 2.0 (the deprecation warning is visible in this cell's original
# output); read the colors from 'axes.prop_cycle' instead.
prop_cycle = mpl.rcParams['axes.prop_cycle'].by_key()['color']
gaussian_number_range = train_scores_mean.index
# Raw strings so the LaTeX escapes (\w, \c, "\ ") are not interpreted as
# (invalid) Python escape sequences.
for column, column_name in zip(['R_square', 'K_S', 'Chi_square'],
                               [r"$\ R^2$", "K-S", r"$\widetilde{\chi^2} $"]):
    # Training curve (dashed) with its variability band.
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    # Test curve (solid) with its variability band.
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test', color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2, color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    # Keep axis limits physically meaningful: R^2 <= 1, K-S and chi^2 >= 0.
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
K_S
Chi_square
In [82]:
# Side-by-side comparison of the empirical (KDE) and fitted (GMM) joint
# densities on the shared (X, Y) grid; only the left panel keeps its
# y-axis, since the two panels share the same y range.
fig = plt.figure(figsize=(5,2.5))
panels = ((kde_Z, 'y'+speed_unit_text, True),
          (pdf_Z, '', False))
for position, (density, y_label, keep_y_axis) in enumerate(panels, start=1):
    panel_ax = fig.add_subplot(1, 2, position)
    plot_2d_prob_density(X, Y, density, ax=panel_ax,
                         xlabel='x'+speed_unit_text, ylabel=y_label, colorbar=False)
    panel_ax.grid(False)
    if not keep_y_axis:
        panel_ax.get_yaxis().set_visible(False)
In [ ]:
# Re-display the key figures produced earlier in the notebook:
# first the fitting-pipeline figures, then the variability diagnostics.
summary_figures = [fig_hist, fig_kde, fig_em, fig_gmm]
variability_figures = [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull,
                       fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull]
for fig in summary_figures + variability_figures:
    display(fig)
In [ ]:
import time

# Persist the notebook, give the save a moment to flush, then export the
# rendered notebook as a standalone HTML report named after the station.
save_notebook()
time.sleep(3)

location_name = get_location_name(file_path)
print(location_name)

current_file = 'GMM.ipynb'
output_file = './output_HTML/{}.html'.format(location_name)

output_HTML(current_file, output_file)